

#######################################
## 2. CLUSTER GENERATION B
#######################################


library(fastLink)
library(here)

i_am("2_cluster_generation_b.R")
rm(list=ls()) # NOTE: clears the global environment, so the MTO data has to be loaded after this point

# set primary directory and folders
# here() returns the project root WITHOUT a trailing slash, so paths are built
# with file.path(); paste0(destination,'temporary_files/') would produce
# '<root>temporary_files/' instead of '<root>/temporary_files/'.
destination <- here()
# This is the folder where you will save temporary files on your machine at different stages of the match process.
# The trailing '/' is kept on purpose: later code appends file names to temp.folder with paste0().
temp.folder <- paste0(file.path(destination,'temporary_files'),'/')

# create temporary and output folders if they don't exist
# (showWarnings=FALSE keeps re-runs quiet when the folders are already there)
dir.create(temp.folder, showWarnings=FALSE, recursive=TRUE)
dir.create(file.path(destination,'output'), showWarnings=FALSE, recursive=TRUE)

# set working directory (the relative './...' paths used by load()/save() below depend on it)
setwd(destination) 

#### load MTO dataset as a data.frame named `mto.dat` (the name the clustering code below expects)
#mto.dat <- #<READ MTO DATASET HERE> 


#### let's see what clusters of data we've already generated (useful if restarting from midway through process)
# list the temporary folder once, then drop the matched-data outputs
files <- list.files(temp.folder)
files <- files[!grepl("matched_dat",files)]
print(files)

# previously saved L2 cluster files (both genders) and MTO cluster files
l2.clusters <- c(files[grepl("l2_female", files)],files[grepl("l2_male", files)])
mto.clusters <- files[grepl("mto_clusters",files)]

# settings and holders used when generating new clusters
gender.cat <- c('male','female') # possible categories (individuals with 'unknown' gender are repeated in both male and female subsets)
size <- 10000000 # this determines the maximum cluster size. Can be adjusted according to match time/success
# NOTE: mto.clusters is intentionally NOT reset to NULL here. Resetting it discarded the
# inventory computed above, which made the restart check below always trigger regeneration.
cluster.stats <- data.frame(cat=c('male','female'),n.clusters=rep(NA,2)) # holder for cluster summaries
cluster.details <- NULL

# Summarise one cluster as a one-row data.frame.
#   gender     - gender category label (an element of gender.cat)
#   cluster.id - cluster number
#   voter.sub  - L2 records belonging to this cluster (uses its `first` column)
#   mto.sub    - MTO records for this gender (uses its `cluster` column)
# Returns columns: gender, cluster, l2 (L2 row count), mto (MTO row count),
# letters (distinct first-name initials of the L2 records, '|'-separated).
summarize.cluster <- function(gender, cluster.id, voter.sub, mto.sub){
  # Count MTO rows by comparing the column against the id passed in as an argument.
  # The previous subset(mto.sub, cluster == cluster) compared the column with itself
  # (always TRUE), so every cluster reported the full MTO row count. A plain count
  # also removes the need for data.table(), which this script never attaches.
  n.mto <- sum(mto.sub$cluster==cluster.id, na.rm=TRUE)
  data.frame(gender=gender,cluster=cluster.id,l2=nrow(voter.sub),mto=n.mto,
             letters=paste(unique(substring(voter.sub$first,1,1)), 
                           collapse="|"))
}

# check to make sure we have all the clusters we want, if not make some new ones
# (scalar condition, so use the short-circuit || rather than elementwise |)
if(length(l2.clusters)<2||length(mto.clusters)<2){
  cat("redo cluster determination\n")

  # loop variable is `gender` rather than `cat` so it no longer shadows base::cat()
  for(gender in gender.cat){
    # start by reading in first names for gender
    # (expected to provide `voter.first`, which is used below)
    load(paste0('./l2_first_names_', gender,'.RData'))
    
    # subset MTO dat to gender category to determine clusters
    # (MTO gender is compared against the first letter of the category: 'm' / 'f')
    mto.sub <- mto.dat[which(mto.dat$gender==substring(gender,1,1)),]
    
    # now determine clusters based on first names of MTO and L2 subsets
    cluster.out <- clusterMatch(vecA = mto.sub$first, vecB = voter.first, max.n=size)
    voter.clusters <- cluster.out$clusterB
    mto.sub$cluster <- cluster.out$clusterA
    
    save(mto.sub,file=paste0(temp.folder,'mto_clusters_',gender,'.RData'))
    cluster.stats$n.clusters[cluster.stats$cat==gender] <- cluster.out$n.clusters
    
    rm(list=c('voter.first','cluster.out')) # clean memory
    
    # load rest of L2 data for matching and save subsets by cluster to reduce pairwise comparisons
    # (expected to provide `voter.full`, row-aligned with voter.clusters)
    load(paste0('l2_match_vars_',gender,'.RData'))
    
    # save each L2 cluster subset and record the cluster summary
    # NOTE(review): assumes L2 cluster ids are the consecutive integers 1..K - confirm
    for(cluster.id in seq_len(length(unique(voter.clusters)))){
      cat(cluster.id)
      voter.sub <- voter.full[voter.clusters==cluster.id,]
      save(voter.sub, file=paste0(temp.folder,paste('l2',gender,cluster.id,sep='_'),'.RData'))
      
      # cluster details
      cluster.details <- rbind(cluster.details,
                               summarize.cluster(gender, cluster.id, voter.sub, mto.sub))
      cat('done\n')
      rm('voter.sub')
    }
    cat(gender,'done\n')
  }
  
  # save cluster stats
  print(cluster.stats)
  print(cluster.details)
  save(cluster.stats, file='./output/cluster_stats.RData')
  save(cluster.details, file="./output/cluster_details.RData")
  
}else{
  # clusters already exist on disk: rebuild the per-cluster summary from the saved files
  cluster.details <- NULL
  for(gender in gender.cat){
    
    # restores `mto.sub` (with its `cluster` column) as saved by the generation branch
    load(paste0(temp.folder,'mto_clusters_',gender,'.RData'))
    
    ### iterate over clusters
    # NOTE(review): assumes MTO cluster ids are the consecutive integers 1..K - confirm
    for(i in seq_len(length(unique(mto.sub$cluster)))){
      #create id for cluster
      id <- paste(gender,i,sep='_')
      
      cat(id,'\n')
      
      # load cluster i (restores `voter.sub`)
      load(paste0(temp.folder,paste('l2',gender,i,sep='_'),'.RData'))
      
      cluster.details <- rbind(cluster.details,
                               summarize.cluster(gender, i, voter.sub, mto.sub))
      cat(id,"done\n")
    }
  }
  
  print(cluster.details)
  save(cluster.details, file="./output/cluster_details.RData")
  
}




